{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9678e0bc-97cd-45bc-bd38-8d79c6789325",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# install required packages\n",
    "!pip install langchain\n",
    "!pip install unstructured\n",
    "!pip install transformers_stream_generator"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "36410a7c-a334-4ba2-abde-1679ac938a2a",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from typing import List, Optional\n",
    "from langchain.llms.base import LLM\n",
    "from modelscope import AutoModelForCausalLM, AutoTokenizer\n",
    "from modelscope import GenerationConfig\n",
    "\n",
    "# initialize qwen 7B model\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"qwen/Qwen-7B-Chat\", revision = 'v1.0.5',trust_remote_code=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(\"qwen/Qwen-7B-Chat\", revision = 'v1.0.5',device_map=\"auto\", trust_remote_code=True, fp16=True).eval()\n",
    "model.generation_config = GenerationConfig.from_pretrained(\"Qwen/Qwen-7B-Chat\",revision = 'v1.0.5', trust_remote_code=True) \n",
    "\n",
    "\n",
    "# torch garbage collection\n",
    "def torch_gc():\n",
    "    os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
    "    DEVICE = \"cuda\"\n",
    "    DEVICE_ID = \"0\"\n",
    "    CUDA_DEVICE = f\"{DEVICE}:{DEVICE_ID}\" if DEVICE_ID else DEVICE\n",
    "    a = torch.Tensor([1, 2])\n",
    "    a = a.cuda()\n",
    "    print(a)\n",
    "\n",
    "    if torch.cuda.is_available():\n",
    "        with torch.cuda.device(CUDA_DEVICE):\n",
    "            torch.cuda.empty_cache()\n",
    "            torch.cuda.ipc_collect()\n",
    "\n",
    "# wrap the qwen model with langchain LLM base class\n",
    "class QianWenChatLLM(LLM):\n",
    "    max_length = 10000\n",
    "    temperature: float = 0.01\n",
    "    top_p = 0.9\n",
    "\n",
    "    def __init__(self):\n",
    "        super().__init__()\n",
    "\n",
    "    @property\n",
    "    def _llm_type(self):\n",
    "        return \"ChatLLM\"\n",
    "\n",
    "    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:\n",
    "        print(prompt)\n",
    "        response, history = model.chat(tokenizer, prompt, history=None)\n",
    "        torch_gc()\n",
    "        return response\n",
    "    \n",
    "# create the qwen llm\n",
    "qwllm = QianWenChatLLM()\n",
    "print('@@@ qianwen LLM created')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ce46aa8d-d772-4990-b748-12872fac2473",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "execution": {
     "iopub.execute_input": "2023-08-11T03:49:17.451327Z",
     "iopub.status.busy": "2023-08-11T03:49:17.450867Z",
     "iopub.status.idle": "2023-08-11T03:49:18.960037Z",
     "shell.execute_reply": "2023-08-11T03:49:18.959128Z",
     "shell.execute_reply.started": "2023-08-11T03:49:17.451304Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "import re\n",
    "import torch\n",
    "\n",
    "from typing import Any, List\n",
    "from pydantic import BaseModel, Extra\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain.document_loaders import UnstructuredFileLoader,TextLoader\n",
    "from langchain.embeddings.base import Embeddings\n",
    "from langchain.prompts import PromptTemplate\n",
    "from langchain.text_splitter import CharacterTextSplitter\n",
    "from langchain.vectorstores import FAISS\n",
    "\n",
    "# define chinese text split logic for divided docs into reasonable size\n",
    "class ChineseTextSplitter(CharacterTextSplitter):\n",
    "    def __init__(self, pdf: bool = False, sentence_size: int = 100, **kwargs):\n",
    "        super().__init__(**kwargs)\n",
    "        self.pdf = pdf\n",
    "        self.sentence_size = sentence_size\n",
    "\n",
    "    def split_text(self, text: str) -> List[str]:   \n",
    "        if self.pdf:\n",
    "            text = re.sub(r\"\\n{3,}\", r\"\\n\", text)\n",
    "            text = re.sub('\\s', \" \", text)\n",
    "            text = re.sub(\"\\n\\n\", \"\", text)\n",
    "\n",
    "        text = re.sub(r'([;；.!?。！？\\?])([^”’])', r\"\\1\\n\\2\", text)  # 单字符断句符\n",
    "        text = re.sub(r'(\\.{6})([^\"’”」』])', r\"\\1\\n\\2\", text)  # 英文省略号\n",
    "        text = re.sub(r'(\\…{2})([^\"’”」』])', r\"\\1\\n\\2\", text)  # 中文省略号\n",
    "        text = re.sub(r'([;；!?。！？\\?][\"’”」』]{0,2})([^;；!?，。！？\\?])', r'\\1\\n\\2', text)\n",
    "        # 如果双引号前有终止符，那么双引号才是句子的终点，把分句符\\n放到双引号后，注意前面的几句都小心保留了双引号\n",
    "        text = text.rstrip()  # 段尾如果有多余的\\n就去掉它\n",
    "        # 很多规则中会考虑分号;，但是这里我把它忽略不计，破折号、英文双引号等同样忽略，需要的再做些简单调整即可。\n",
    "        ls = [i for i in text.split(\"\\n\") if i]\n",
    "        for ele in ls:\n",
    "            if len(ele) > self.sentence_size:\n",
    "                ele1 = re.sub(r'([,，.][\"’”」』]{0,2})([^,，.])', r'\\1\\n\\2', ele)\n",
    "                ele1_ls = ele1.split(\"\\n\")\n",
    "                for ele_ele1 in ele1_ls:\n",
    "                    if len(ele_ele1) > self.sentence_size:\n",
    "                        ele_ele2 = re.sub(r'([\\n]{1,}| {2,}[\"’”」』]{0,2})([^\\s])', r'\\1\\n\\2', ele_ele1)\n",
    "                        ele2_ls = ele_ele2.split(\"\\n\")\n",
    "                        for ele_ele2 in ele2_ls:\n",
    "                            if len(ele_ele2) > self.sentence_size:\n",
    "                                ele_ele3 = re.sub('( [\"’”」』]{0,2})([^ ])', r'\\1\\n\\2', ele_ele2)\n",
    "                                ele2_id = ele2_ls.index(ele_ele2)\n",
    "                                ele2_ls = ele2_ls[:ele2_id] + [i for i in ele_ele3.split(\"\\n\") if i] + ele2_ls[\n",
    "                                                                                                       ele2_id + 1:]\n",
    "                        ele_id = ele1_ls.index(ele_ele1)\n",
    "                        ele1_ls = ele1_ls[:ele_id] + [i for i in ele2_ls if i] + ele1_ls[ele_id + 1:]\n",
    "\n",
    "                id = ls.index(ele)\n",
    "                ls = ls[:id] + [i for i in ele1_ls if i] + ls[id + 1:]\n",
    "        return ls\n",
    "\n",
    "\n",
    "# using modelscope text embedding method for embedding tool\n",
    "class ModelScopeEmbeddings(BaseModel, Embeddings):\n",
    "    embed: Any\n",
    "    model_id: str =\"damo/nlp_corom_sentence-embedding_english-base\"\n",
    "    \"\"\"Model name to use.\"\"\"\n",
    "\n",
    "    def __init__(self, **kwargs: Any):\n",
    "        \"\"\"Initialize the modelscope\"\"\"\n",
    "        super().__init__(**kwargs)\n",
    "        try:\n",
    "            from modelscope.models import Model\n",
    "            from modelscope.pipelines import pipeline\n",
    "            from modelscope.utils.constant import Tasks\n",
    "            self.embed = pipeline(Tasks.sentence_embedding,model=self.model_id)\n",
    "\n",
    "        except ImportError as e:\n",
    "            raise ValueError(\n",
    "                \"Could not import some python packages.\" \"Please install it with `pip install modelscope`.\"\n",
    "            ) from e\n",
    "\n",
    "    class Config:\n",
    "        extra = Extra.forbid\n",
    "\n",
    "    def embed_documents(self, texts: List[str]) -> List[List[float]]:\n",
    "        texts = list(map(lambda x: x.replace(\"\\n\", \" \"), texts))\n",
    "        inputs = {\"source_sentence\": texts}\n",
    "        embeddings = self.embed(input=inputs)['text_embedding']\n",
    "        return embeddings\n",
    "\n",
    "    def embed_query(self, text: str) -> List[float]:\n",
    "        text = text.replace(\"\\n\", \" \")\n",
    "        inputs = {\"source_sentence\": [text]}\n",
    "        embedding = self.embed(input=inputs)['text_embedding'][0]\n",
    "        return embedding\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "ca3dc051-1b0b-4bec-b082-6e94b220a34d",
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-08-10T06:44:05.671065Z",
     "iopub.status.busy": "2023-08-10T06:44:05.670720Z",
     "iopub.status.idle": "2023-08-10T06:44:05.674188Z",
     "shell.execute_reply": "2023-08-10T06:44:05.673699Z",
     "shell.execute_reply.started": "2023-08-10T06:44:05.671045Z"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# define prompt template\n",
    "prompt_template = \"\"\"请基于```内的内容回答问题。\"\n",
    "\t```\n",
    "\t{context}\n",
    "\t```\n",
    "\t我的问题是：{question}。\n",
    "\"\"\"\n",
    "\n",
    "prompt = PromptTemplate(template=prompt_template, input_variables=[\"context\", \"question\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a41ff8b8-bf19-4766-8d90-af48c7dfda99",
   "metadata": {
    "ExecutionIndicator": {
     "show": true
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "# load the vector db and upsert docs with vector to db\n",
    "\n",
    "print('@@@ reading docs ...')\n",
    "sentence_size = 1600\n",
    "embeddings = ModelScopeEmbeddings(model_id=\"damo/nlp_corom_sentence-embedding_chinese-tiny\")\n",
    "\n",
    "filepath = \"../../../README_zh.md\"\n",
    "if filepath.lower().endswith(\".md\"):\n",
    "    loader = UnstructuredFileLoader(filepath, mode=\"elements\")\n",
    "    docs = loader.load()\n",
    "elif filepath.lower().endswith(\".txt\"):\n",
    "    loader = TextLoader(filepath, autodetect_encoding=True)\n",
    "    textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)\n",
    "    docs = loader.load_and_split(textsplitter) \n",
    "\n",
    "db = FAISS.from_documents(docs, embeddings)\n",
    "print('@@@ reading doc done, vec db created.')\n",
    "\n",
    "\n",
    "# create knowledge chain\n",
    "kc = RetrievalQA.from_llm(llm=qwllm, retriever=db.as_retriever(search_kwargs={\"k\": 6}), prompt=prompt)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c97b1a9e-6260-4429-8411-a3a2cddadb05",
   "metadata": {
    "ExecutionIndicator": {
     "show": false
    },
    "execution": {
     "iopub.execute_input": "2023-08-06T06:14:23.817772Z",
     "iopub.status.busy": "2023-08-06T06:14:23.817192Z",
     "iopub.status.idle": "2023-08-06T06:14:27.775706Z",
     "shell.execute_reply": "2023-08-06T06:14:27.775194Z",
     "shell.execute_reply.started": "2023-08-06T06:14:23.817734Z"
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "请基于```内的内容回答问题。\"\n",
      "\t```\n",
      "\tContext:\n",
      "ModelScope Library为模型贡献者提供了必要的分层API，以便将来自 CV、NLP、语音、多模态以及科学计算的模型集成到ModelScope生态系统中。所有这些不同模型的实现都以一种简单统一访问的方式进行封装，用户只需几行代码即可完成模型推理、微调和评估。同时，灵活的模块化设计使得在必要时也可以自定义模型训练推理过程中的不同组件。\n",
      "\n",
      "Context:\n",
      "ModelScope 是一个“模型即服务”(MaaS)平台，旨在汇集来自AI社区的最先进的机器学习模型，并简化在实际应用中使用AI模型的流程。ModelScope库使开发人员能够通过丰富的API设计执行推理、训练和评估，从而促进跨不同AI领域的最先进模型的统一体验。\n",
      "\n",
      "Context:\n",
      "除了包含各种模型的实现之外，ModelScope Library还支持与ModelScope后端服务进行必要的交互，特别是与Model-Hub和Dataset-Hub的交互。这种交互促进了模型和数据集的管理在后台无缝执行，包括模型数据集查询、版本控制、缓存管理等。\n",
      "\t```\n",
      "\t我的问题是：modelscope是什么？。\n",
      "\n",
      "tensor([1., 2.], device='cuda:0')\n"
     ]
    }
   ],
   "source": [
    "# test the knowledge chain\n",
    "query = 'modelscope是什么？'\n",
    "result = kc({\"query\": query})\n",
    "print(result)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.16"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
